{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import lightgbm as lgb\n",
    "import xgboost as xgb\n",
    "from sklearn.preprocessing import LabelEncoder,OneHotEncoder\n",
    "from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD,SparsePCA\n",
    "from sklearn.model_selection import KFold,StratifiedKFold\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import gc\n",
    "import time\n",
    "import os\n",
    "import sys\n",
    "import warnings\n",
    "warnings.filterwarnings('ignore')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column names are supplied through `names`, so the first line of every file is read as data.\n",
    "# The per-user tables are ordered by uid right after loading.\n",
    "train = pd.read_csv(\"../data/age_train.csv\", names=['uid', 'age_group']).sort_values('uid')\n",
    "test = pd.read_csv(\"../data/age_test.csv\", names=['uid']).sort_values('uid')\n",
    "info = pd.read_csv(\"../data/app_info.csv\", names=['appid', 'category'])\n",
    "active = pd.read_csv(\"../data/user_app_actived.csv\", names=['uid', 'appid']).sort_values('uid')\n",
    "\n",
    "# The usage log is kept in file order; use_date is parsed into datetimes.\n",
    "usage = pd.read_csv(\n",
    "    \"../data/user_app_usage.csv\",\n",
    "    names=['uid', 'appid', 'duration', 'times', 'use_date'],\n",
    "    parse_dates=['use_date'],\n",
    ")\n",
    "\n",
    "user_basic_info = pd.read_csv(\n",
    "    \"../data/user_basic_info.csv\",\n",
    "    names=[\n",
    "        'uid', 'gender', 'city', 'prodname',\n",
    "        'ramcapacity', 'ramleftration', 'romcapacity', 'romleftration',\n",
    "        'color', 'fontsize', 'ct', 'carrier', 'os',\n",
    "    ],\n",
    ").sort_values('uid')\n",
    "behavior_info = pd.read_csv(\n",
    "    \"../data/user_behavior_info.csv\",\n",
    "    names=['uid', 'boottimes', 'a', 'b', 'c', 'd', 'e', 'f', 'g'],\n",
    ").sort_values('uid')\n",
    "\n",
    "# Shape summary; usage.shape is not part of the printed tuple.\n",
    "print(\n",
    "    (train.shape, test.shape),\n",
    "    (info.shape, active.shape, user_basic_info.shape, behavior_info.shape),\n",
    ")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "pickle_path = \"../pickle\"\n",
    "if not os.path.exists(pickle_path):\n",
    "    os.mkdir(pickle_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "USAGE TO PICKLE:  109.59347701072693\n"
     ]
    }
   ],
   "source": [
    "# Cache the raw usage table as a pickle, but only when no cached file exists yet.\n",
    "usage_pickle_file = \"{}/user_app_usage.pickle\".format(pickle_path)\n",
    "if not os.path.exists(usage_pickle_file):\n",
    "    started = time.time()\n",
    "    usage.to_pickle(usage_pickle_file)\n",
    "    print('USAGE TO PICKLE: ', time.time() - started)\n",
    "\n",
    "# One row per uid holding the list of appids from that user's usage rows, in row order.\n",
    "# This part is recomputed and rewritten on every run (it is not guarded by the cache check).\n",
    "usage_app_seq = (\n",
    "    usage[['uid', 'appid']]\n",
    "    .groupby('uid')['appid']\n",
    "    .apply(list)\n",
    "    .reset_index()\n",
    ")\n",
    "usage_app_seq.to_pickle(\"{}/user_app_seq.pickle\".format(pickle_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tqdm import tqdm\n",
    "\n",
    "def flatten_active(df):    \n",
    "    u = []\n",
    "    a = []\n",
    "    for i in tqdm(range(len(df['appid'].values))):\n",
    "        u += [df['uid'].values[i]]*df['app_len'].values[i]\n",
    "        a += list(df['appid'].values[i])\n",
    "        \n",
    "    new_df = pd.DataFrame()\n",
    "    new_df['uid'] = u\n",
    "    new_df['appid'] = a\n",
    "        \n",
    "    return new_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 4999341/4999341 [00:51<00:00, 96820.22it/s]\n"
     ]
    }
   ],
   "source": [
    "# Split the '#'-joined app string into a list of app ids. Values that are not strings\n",
    "# (i.e. already split) are passed through unchanged, so re-running this cell on the\n",
    "# processed frame no longer raises AttributeError.\n",
    "active['appid'] = active['appid'].map(lambda apps: apps.split('#') if isinstance(apps, str) else apps)\n",
    "# Number of apps per user row.\n",
    "active['app_len'] = active['appid'].map(len)\n",
    "active = active.reset_index(drop=True)\n",
    "# Long format: one row per (uid, appid) pair.\n",
    "deal_active = flatten_active(active)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ACTIVE TO PICKLE:  54.16705322265625\n"
     ]
    }
   ],
   "source": [
    "# Write the `active` frame to disk only when no cached copy exists yet, and report the elapsed time.\n",
    "active_pickle_file = \"{}/user_app_active.pickle\".format(pickle_path)\n",
    "if not os.path.exists(active_pickle_file):\n",
    "    started = time.time()\n",
    "    active.to_pickle(active_pickle_file)\n",
    "    print('ACTIVE TO PICKLE: ', time.time() - started)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Deal ACTIVE TO PICKLE:  59.198060512542725\n"
     ]
    }
   ],
   "source": [
    "# Write the flattened `deal_active` frame to disk only when no cached copy exists yet.\n",
    "flatten_pickle_file = \"{}/user_app_active_flatten.pickle\".format(pickle_path)\n",
    "if not os.path.exists(flatten_pickle_file):\n",
    "    started = time.time()\n",
    "    deal_active.to_pickle(flatten_pickle_file)\n",
    "    print('Deal ACTIVE TO PICKLE: ', time.time() - started)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reload the cached active table through pickle_path, the same directory variable the\n",
    "# cache-writing cell uses, so the read and write locations cannot drift apart.\n",
    "active = pd.read_pickle(\"{}/user_app_active.pickle\".format(pickle_path))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>uid</th>\n",
       "      <th>appid</th>\n",
       "      <th>app_len</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1000006</td>\n",
       "      <td>[a001012, a001036, a001062, a001172, a001275, ...</td>\n",
       "      <td>47</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1000009</td>\n",
       "      <td>[a001012, a001015, a001055, a001062, a00107, a...</td>\n",
       "      <td>73</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1000010</td>\n",
       "      <td>[a001012, a001036, a001050, a001055, a001062, ...</td>\n",
       "      <td>96</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1000011</td>\n",
       "      <td>[a001012, a001063, a002450, a003083, a00326, a...</td>\n",
       "      <td>21</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1000012</td>\n",
       "      <td>[a001036, a001062, a001580, a001583, a003570, ...</td>\n",
       "      <td>33</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       uid                                              appid  app_len\n",
       "0  1000006  [a001012, a001036, a001062, a001172, a001275, ...       47\n",
       "1  1000009  [a001012, a001015, a001055, a001062, a00107, a...       73\n",
       "2  1000010  [a001012, a001036, a001050, a001055, a001062, ...       96\n",
       "3  1000011  [a001012, a001063, a002450, a003083, a00326, a...       21\n",
       "4  1000012  [a001036, a001062, a001580, a001583, a003570, ...       33"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first five rows of the reloaded table.\n",
    "active.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stack train and test rows. DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;\n",
    "# pd.concat is the supported equivalent (test rows get NaN for columns only train has).\n",
    "all_data = pd.concat([train, test])\n",
    "all_data = all_data.sort_values(by=['uid']).reset_index(drop=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Right joins keep every uid of the train / test table, whether or not `active` has a row for it.\n",
    "active_train = pd.merge(active, train, how='right', on='uid')\n",
    "active_train.to_pickle(\"../pickle/active_text_train.pickle\")\n",
    "\n",
    "active_test = pd.merge(active, test, how='right', on='uid')\n",
    "active_test.to_pickle(\"../pickle/active_text_test.pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Right joins keep every uid of the train / test table, whether or not `usage` has rows for it.\n",
    "usage_train = pd.merge(usage, train, how='right', on='uid')\n",
    "usage_train.to_pickle(\"../pickle/usage_text_train.pickle\")\n",
    "\n",
    "usage_test = pd.merge(usage, test, how='right', on='uid')\n",
    "usage_test.to_pickle(\"../pickle/usage_text_test.pickle\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
